/***********************************************************************
* This program calcualte abnormal Google SVI measures for Table 8
************************************************************************/

/******************************************************************************
                                Prepare SVI data
******************************************************************************/
*   - to convert the overlapping windows into one entire window 
    - 1. overlapping obs are summed in each window and the ratio of the 
         two sums are calcualted as the adjustment factor to make the 
         two windows comparable
    - 2. repeat the above step for all windows until we have one time 
         series;
* 1 get a sense of how the ratio differes from day to day for overlapping obs;
data WORK.SVI    ;
   %let _EFIERR_ = 0; /* set the ERROR detection macro variable */
   infile 'data\svi.csv' delimiter = ',' MISSOVER DSD
   lrecl=32767 ;
      informat VAR1 $5. ;
      informat VAR2 ddmmyy10. ;
      informat VAR3 ddmmyy10. ;
      informat VAR4 best32. ;
      format VAR1 $5. ;
      format VAR2 mmddyy10. ;
      format VAR3 ddmmyy10. ;
      format VAR4 best12. ;
      input
            VAR1 $
            VAR2
            VAR3
            VAR4
     ;
      if _ERROR_ then call symputx('_EFIERR_',1);  /* set ERROR detection macro variable */
      run;

* rename the varible;
DATA svi;
  SET svi;
  RENAME var1 = ticker
         var2 = beg_date
         var3 = date
         var4 = svi;
  RUN; 
  ** Prep the data for next steps;
  * remove missing svi;
  data svi;
    set svi;
    if svi ne .;
    ticker = strip(ticker);
    run; 
  * sort;
  proc sort data = svi; by ticker date beg_date; run ;
  data svi;
    set svi;
    by ticker date;
    group +1; 
    if first.date then group =1 ;
    run; * group =1 means the first time that day appeared;

* reorganize the data;
proc sql;
  create table ticker_beg_date
  as select distinct ticker, beg_date
  from svi
  order by ticker, beg_date;
  quit;
data ticker_beg_date;
  set ticker_beg_date;
  by ticker beg_date;
  nwindow+1;
  if first.ticker then nwindow = 1;
  run; 
proc sql;
  create table lead_lag
  as select a.ticker, a.beg_date as lag, b.beg_date as lead,
    a.nwindow as lag_nwindow, b.nwindow as lead_nwindow
  from ticker_beg_date as a, ticker_beg_date as b
  where a.ticker = b.ticker and a.nwindow +1 = b.nwindow
  order by a.ticker, a.beg_date, b.beg_date;
  quit; 

* add obs of svi from two windows;
proc sql;
  * match variables to the lag_window;
  create table lag_svi
  as select distinct a.*, b.date, b.svi as lag_svi
  from lead_lag as a, svi as b
  where a.ticker = b.ticker and b.beg_date = a.lag;

  * match variables to the lead_window;
  create table lead_svi
  as select distinct a.*, b.date, b.svi as lead_svi
  from lead_lag as a, svi as b
  where a.ticker = b.ticker and b.beg_date = a.lead;

  * match both and only overlapping are left; 
  create table lead_lag_svi
  as select a.*, b.*
  from lag_svi as a, lead_svi as b
  where a.ticker = b.ticker and a.lag = b.lag
    and a.lead = b.lead and a.date = b.date
  order by a.ticker, a.lag, a.lead, a.date;
  quit; 
  * check the correlation;
  proc corr data = lead_lag_svi;
    var lag_svi lead_svi;
    run; * 0.82 correlation; 
* for each lead-lag pairs, ratio is the sum of lead_svi
  divided by the sum of the lag_svi;
proc sql;
  create table lag_lead_ratio
  as select distinct a.ticker, a.lag, a.lead, a.lag_nwindow,
    a.lead_nwindow, mean(lag_svi) as mean_lag_svi,
    mean(lead_svi) as mean_lead_svi, 
    mean(lag_svi)/mean(lead_svi) as lag_lead_ratio
  from lead_lag_svi as a
  group by ticker, lag, lead;
  quit; 

* summation and then divide shall greatly reduces the individual 
  noisies and should be superior than using individual ratio;
* Now calculate cumulative ratio so that every subsequent window 
  can be adjusted to the first window measures ever appeared;
  * latter windows may be associated with higher scalling errors
    as it is adjusted so many times; 
  * an alternative way is to adjust all obs to the permno-rdq1 
    window (the first time rdq1 appeared) after matching; 
* method 1: convert all windows to the scale of the first window;
data lag_lead_ratio_c1;
  set lag_lead_ratio;
  by ticker lag lead;
  retain cratio;
  if first.ticker then cratio =1;
  cratio = cratio*lag_lead_ratio;
  run;
data lag_lead_ratio_c1;
  set lag_lead_ratio_c1;
  by ticker lag lead;
  output;
  if first.ticker then do;
    lead = lag;
    lead_nwindow = lag_nwindow;
    mean_lead_svi = mean_lag_svi;
    lag_lead_ratio = 1;
    cratio = 1;
    output;
  end; 
  run; 
proc sort data = lag_lead_ratio_c1; by ticker lag lead; run; 
* method 2: convert all windows to a given rdq1 window;
  *create cumulative ratio w.r.t. a particualr window;
proc sql;
  create table ci
  as select distinct a.ticker, a.beg_date as i_window, 
    a.nwindow as ni, b.beg_date as j_window,
    b.nwindow as nj
  from ticker_beg_date as a, ticker_beg_date as b
  where a.ticker = b.ticker
  order by a.ticker, a.beg_date, b.beg_date;
  quit; 
proc sql;
  create table ij_ratio
  as select distinct a.*, b.cratio as c_i1, 
    c.cratio as c_j1,  c.cratio/b.cratio as c_ji
  from ci as a, lag_lead_ratio_c1 as b, lag_lead_ratio_c1 as c
  where a.ticker = b.ticker= c.ticker and a.i_window = b.lead
    and a.j_window = c.lead
  order by a.ticker, a.i_window, a.j_window;
  quit; 
* to adjust svi from window j to window i, multiply svi by c_ji;

* Now adjust all svi to the first window;
proc sql;
  create table svi_1
  as select a.*, a.svi*c_ji as svi_1
  from svi as a, ij_ratio as b
  where a.ticker = b.ticker and a.beg_date = b.j_window
    and b.ni = 1
  order by a.ticker, a.date, a.beg_date;
  quit ;
* Observation: svi in some winodws, even adjusted, can be
  far off from the rest of the observations
  this is especially true if one of the unadjusted svi = 0
  - t1: take only g=1
  - t2: take average across all groups
  - t3: similar to t2 but exclude all 0s
  - t4: similar to t2 but exclude maximum and minimum
  - t5: take median of all groups;
* t2 works the best;
proc sql;
  create table ngroups
  as select distinct *, count(group) as ngroups
  from svi_1
  group by ticker, date
  order by ticker, date;
  quit; 

proc sql;
  create table svi_t2
  as select distinct ticker, date, 
    beg_date, ngroups, svi_1,
    mean(svi_1) as svi_t2
  from ngroups
  group by ticker, date
  order by ticker, date;

data svi_t;
  merge svi_t2;
  by ticker date;
  run;

* select only trading days; 
proc sql;
  create table svi_tdays 
  as select a.*, b.number
  from svi_t as a, rep.tdays_2018 as b
  where a.date = b.date; 
  quit;  

* Download ticker to permno data; 
  %let wrds = wrds.wharton.upenn.edu 4016;
  options comamid=TCP remote=WRDS;
  signon username=_prompt_;
  rsubmit;
  proc download data = crsp.stocknames out = stocknames; run ;
  endrsubmit; 

  * libname crsp "E:\CRSP";
  * data stocknames; set crsp.stocknames; run;

* match with permno for the next steps;
proc sql;
  create table svi_tdays_permno
  as select a.*, b.permno, b.namedt, b.nameenddt
  from svi_tdays as a, stocknames as b
  where strip(a.ticker) = strip(b.ticker) and b.namedt<=a.date<=b.nameenddt
  order by b.permno, a.date;
  quit;



    * Get permno-rdq1 paris; 
    proc sql;
      create table permno_rdq1
      as select a.*, b.number 
      from (select distinct permno, rdq1, fyearq, fqtr
      from rep.earnings) as a, 
           rep.tdays as b
      where a.rdq1 = b.date and a.permno ne .
      order by permno, rdq1;
      quit; 

    ** Check how many trading days are between the current rdq1 and the next rdq1;
      * In case there are missing reports for the previous or the next qtr, set
        these varaibels to be missing as well; 
      * First, start with next rdq1;
      proc sort data = permno_rdq1; 
        by permno descending fyearq descending fqtr descending rdq1; 
        run; 
      data permno_rdq1;
        set permno_rdq1;
        by  permno descending fyearq descending fqtr descending rdq1;
        evttime_nrdq1 = lag(number) - number;
        next_rdq1 = lag(rdq1);
        if (fyearq - lag(fyearq))*4+ fqtr - lag(fqtr) ne -1 then do;
          evttime_nrdq1 = .;
          next_rdq1 = .;
        end; 
        if first.permno then do;
          evttime_nrdq1 = . ; 
          next_rdq1 = .;
        end; 
        format next_rdq1 date7.;
        run; 
      * Second, calcualte the lag rdq1;
      proc sort data = permno_rdq1; 
        by permno  fyearq  fqtr  rdq1; 
        run; 
      data permno_rdq1;
        set permno_rdq1;
        by  permno fyearq fqtr rdq1;
        evttime_lrdq1 = lag(number) - number;
        lag_rdq1 = lag(rdq1);
        if (fyearq - lag(fyearq))*4+ fqtr - lag(fqtr) ne 1 then do;
          evttime_lrdq1 = .;
          lag_rdq1 = .;
        end; 
        if first.permno then do;
          evttime_lrdq1 = . ; 
          lag_rdq1 = .;
        end; 
        format lag_rdq1 date7.;
        run; 




* match permno_rdq1 with svi_ts;
proc sql;
  create table ab_svi_t
  as select a.*, b.*, b.number - a.number as evttime
  from permno_rdq1 as a, svi_tdays_permno as b
  where a.permno = b.permno and 
    a.lag_rdq1 < b.date < a.next_rdq1
  order by a.permno, a.rdq1, b.date;
  quit;
* calcualte benchmark means;
* calculate v1 [-21, -1]
            v2 [-31, -1]
            v3 [-31, -11]
            v4 [-41, -11];
* use v4 [-41, -11];
proc sql;
  create table v4
  as select distinct permno, rdq1, 
    mean(log(1+svi_t2)) as logsvipre_v4_t2
    count(svi_t1) as countdays
  from ab_svi_t
  where -41<=evttime<=-11
  group by permno, rdq1
  /*having calculated countdays = 31*/
  order by permno, rdq1;
  quit; 
data all_svi_pre;
  set v4;
  by permno rdq1;
  run; 

* calcualte the demeaned svi;
proc sql;
  create table absvi_t
  as select a.*, b.*
  from ab_svi_t as a left join all_svi_pre as b
  on a.permno = b.permno and a.rdq1 = b.rdq1
  order by a.permno, a.fyearq, a.fqtr, a.rdq1, a.evttime;
  quit; 
* Demeaning;
data absvi_t1;  
  set absvi_t;
  logabsvi_v4_t2 = log(1+svi_t2) - logsvipre_v4_t2;
  run; 

* calculate the absvi over various windows; 


/******************************************************************************
                              Get Window Averages
******************************************************************************/
%macro wd(tag = );
* delete ffret: dataset; 
proc datasets library=work;
   delete ffret:;
run;
/*%let start = 0 2 2 2 2  2  2  2  2  2  2;*/
/*%let end   = 1 3 4 5 10 20 30 40 50 60 99;*/
%let start = %str(0);
%let end   = %str(1);
%let nwords=%sysfunc(countw(&start));
%do i=1 %to &nwords;
  %let s = %qscan(&start,&i,%str(,));
  %let e = %scan(&end, &i);
  %let d = %eval(&e-&s+1);

  %let s0 = %sysfunc(abs(&s));
  %let e0 = %sysfunc(abs(&e));
  %if &e<0 %then %let e0 =_&e0;
  %if &s<0 %then %let s0 =_&s0;

    proc sql;
      create table ffret&i
        as select distinct permno, rdq1,
          mean(logabsvi_v4_t2) as logabsvi&s0.&e0._v4_t2._&tag,
          count(logabsvi_v4_t2) as countdays
        from absvi_t1
        where &s<=evttime<= &e and evttime_nrdq1-1 >= &e
        group by permno, rdq1
        having calculated countdays = &d
        order by permno, rdq1;
      quit; 
%end; 

* from t+1 until next rdq1-5, excluding pre-annoucement leakage of next rdq1; 
%let s0 = 2;
%let e0 = 95;
proc sql;
  create table ffret0_2
        as select distinct permno, rdq1,
           mean(logabsvi_v4_t2) as logabsvi&s0.&e0._v4_t2._&tag,
           count(logabsvi_v4_t2) as countdays

    from absvi_t1
    where 2<=evttime<= evttime_nrdq1-5
    group by permno, rdq1
    having calculated countdays = evttime_nrdq1-6
    order by permno, rdq1;
quit;

data all_merged;
  merge ffret:;
  by permno rdq1;
  drop countdays;
  run; 
%mend; 
%wd(tag=crrted);  

/******************************************************************************
                              Get SVI Persistence
******************************************************************************/

%macro persis_absvi(input = , svinames = , end = , lag = , out = );
  /*%let end = next_rdq1-rdq1-5; */
  data time_series4;
    set &input;
    %do i=1 %to %sysfunc(countw(&svinames));
      %let name&i = %scan(&svinames, &i, %str( ));
      if &&name&i ne .;
    %end;
    keep rdq1 permno &svinames evttime;
    if evttime>=0 and evttime<=&end;  
  /*  if permno = 18067;*/
    run; 
  /*data check; set time_series4; if missing(sev_vom_minus_plus_ni); run ;*/
  * Make sure there are enough distinct values for each run;
  proc sql;
    create table ret_count
    as select distinct permno, rdq1, &name1
    from time_series4;

    create table time_series5
    as select distinct a.*, b.ret_count
    from time_series4 as a, 
         (select distinct permno, rdq1, count(&name1) 
          as ret_count from ret_count group by permno,
          rdq1) as b
    where a.permno = b.permno and a.rdq1 = b.rdq1 and b.ret_count>=10
    order by permno, rdq1, evttime;
    quit; 
  option nonotes; 
  proc iml;
  use time_series5;
  read all var{permno rdq1} into firm;
  %do i=1 %to %sysfunc(countw(&svinames));
    %let name&i = %scan(&svinames, &i, %str( ));
    read all var{&&name&i} into v&i;
  %end;/*read all var{vomh_ab} into vomh;*/
  close;
  %let svinames_lgth = %sysfunc(countw(&svinames));
  %let n1 = %eval(&svinames_lgth+1);
  %let n2 = %eval(&svinames_lgth+2);
  b = uniqueby(firm, 1:2);  
  lala = j(nrow(b),&n2, 99);         /* 3. Allocate vector to hold results */
  lala[,&n1:&n2] = firm[b,1:2];
  b = b // (nrow(firm)+1);     /* trick: append (n+1) to end of b */
  do i = 1 to nrow(b)-1;    /* 4. For each level... */
  /*do i = 1 to 1000;*/
     idx = b[i]:(b[i+1]-1); /* 5. Find observations in level */
     if (b[i+1]-b[i])>=30 then do;
      %do j=1 %to %sysfunc(countw(&svinames));  
         v&j._dm = v&j[idx]-mean(v&j[idx]);
         call farmafit(d1,phi,ma,sigma,v&j._dm); 
         lala[i,&j] = d1; 
      %end;
     end;
  end;


  create mydat4 from lala;
  append from lala;
  close mydat4;

  quit;
  option notes;
  data &out; set mydat4; 
    rename col&n1 = permno; rename col&n2 = rdq1;
    format col&n2 date9.;
     %do i =1 %to %sysfunc(countw(&svinames));  
        %let name&i = %scan(&svinames, &i, %str( ));
        rename col&i = d_&&name&i;
     %end;
    run;
  proc sort data = &out; by permno rdq1; run ;
%mend; 
%let namelist = logabsvi_v4_t2
%persis_absvi(input = absvi_t1, svinames = &namelist, end = evttime_nrdq1, lag = 5, out = d_logsvi_t);

/******************************************************************************
                               Combine and Merge
******************************************************************************/
data rep.svi;
  merge all_merged d_logsvi_t;
  by permno rdq1;
  run; 


